Data Description & Context: Parkinson’s Disease (PD) is a degenerative neurological disorder marked by decreased dopamine levels in the brain. It manifests itself through a deterioration of movement, including the presence of tremors and stiffness. There is commonly a marked effect on speech, including dysarthria (difficulty articulating sounds), hypophonia (lowered volume), and monotone (reduced pitch range). Additionally, cognitive impairments and changes in mood can occur, and risk of dementia is increased. Traditional diagnosis of Parkinson’s Disease involves a clinician taking a neurological history of the patient and observing motor skills in various situations. Since there is no definitive laboratory test to diagnose PD, diagnosis is often difficult, particularly in the early stages when motor effects are not yet severe. Monitoring progression of the disease over time requires repeated clinic visits by the patient. An effective screening process, particularly one that doesn’t require a clinic visit, would be beneficial. Since PD patients exhibit characteristic vocal features, voice recordings are a useful and non-invasive tool for diagnosis. If machine learning algorithms could be applied to a voice recording dataset to accurately diagnose PD, this would be an effective screening step prior to an appointment with a clinician.
name - ASCII subject name and recording number,
MDVP:Fo(Hz) - Average vocal fundamental frequency
MDVP:Fhi(Hz) - Maximum vocal fundamental frequency
MDVP:Flo(Hz) - Minimum vocal fundamental frequency
MDVP:Jitter(%),MDVP:Jitter(Abs),MDVP:RAP,MDVP:PPQ,Jitter:DDP - Several measures of variation in fundamental frequency
MDVP:Shimmer,MDVP:Shimmer(dB),Shimmer:APQ3,Shimmer:APQ5,MDVP:APQ,Shimmer:DDA - Several measures of variation in amplitude
NHR,HNR - Two measures of ratio of noise to tonal components in the voice
status - Health status of the subject (one) - Parkinson's, (zero) - healthy
RPDE,D2 - Two nonlinear dynamical complexity measures
DFA - Signal fractal scaling exponent
spread1,spread2,PPE - Three nonlinear measures of fundamental frequency variation
Techniques used: Exploratory Data Analysis, Supervised Learning, Ensemble Learning.
The goal is to classify each patient into the correct label (Parkinson's vs. healthy) using the attributes derived from their voice recordings.
import numpy as np               # linear algebra
import pandas as pd              # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt  # matplotlib.pyplot plots data
%matplotlib inline
import seaborn as sns

from scipy.stats import zscore

from sklearn import metrics
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression  # added: used below but was never imported
from sklearn.model_selection import train_test_split
import sklearn.tree as DecisionTreeClassifier        # NOTE: binds the *module* to this name (kept for history)
from sklearn.tree import DecisionTreeClassifier      # added: the actual estimator class (shadows the module alias above)
# Load the Parkinson's voice-recording dataset and take a first look at it.
# NOTE(review): the file name has no extension — presumably a CSV export;
# verify the path against the data directory.
Data = pd.read_csv("Data-Parkinsons")
Data.head()     # first five rows
Data.shape      # (rows, columns)
Data.columns    # attribute names
Data.info()     # dtypes and non-null counts
# BUGFIX: the original used `.transpose` without parentheses, which evaluates
# to the bound method object instead of the transposed summary table.
Data.describe().transpose()
# Attributes with variation in fundamental frequency / amplitude to examine:
# 'MDVP:Fo(Hz)', 'MDVP:Fhi(Hz)', 'MDVP:Flo(Hz)', 'MDVP:RAP', 'MDVP:PPQ',
# 'Jitter:DDP', 'MDVP:Shimmer', 'MDVP:Shimmer(dB)', 'Shimmer:APQ3',
# 'Shimmer:APQ5', 'MDVP:APQ', 'Shimmer:DDA', 'NHR'
# Box plots of the attributes, four groups of columns at a time, to get a
# first visual impression of spread and outliers.
for start, stop in ((0, 9), (9, 14), (14, 20), (20, 25)):
    plt.figure(figsize=(12, 10))
    Data.iloc[:, start:stop].boxplot()
    plt.show()
# Closer look at the nine attributes that showed notable outliers above:
# one seaborn box plot per axis, all on a single row.
outlier_columns = [
    'MDVP:Fhi(Hz)', 'MDVP:Flo(Hz)', 'MDVP:Shimmer', 'MDVP:Shimmer(dB)',
    'Shimmer:APQ3', 'Shimmer:APQ5', 'MDVP:APQ', 'HNR', 'spread1',
]
fig, ax = plt.subplots(1, 9, figsize=(18, 5))
for axis, column in zip(ax, outlier_columns):
    sns.boxplot(Data[column], ax=axis)
plt.show()
# Attributes whose distributions are inspected below:
# 'MDVP:Fo(Hz)', 'MDVP:Fhi(Hz)', 'MDVP:Flo(Hz)', 'MDVP:RAP', 'MDVP:PPQ',
# 'Jitter:DDP', 'MDVP:Shimmer', 'MDVP:Shimmer(dB)', 'Shimmer:APQ3',
# 'Shimmer:APQ5', 'MDVP:APQ', 'Shimmer:DDA', 'NHR'
# Histograms (100 bins) of every numeric attribute to inspect distributions.
Data.hist(stacked=False, bins=100, figsize=(12,30), layout=(14,2));
# Pairwise scatter plots for a hand-picked subset of attributes plus the
# 'status' target, then for the full frame.
# NOTE(review): the full pairplot includes the non-numeric 'name' column —
# seaborn may drop or choke on it depending on version; confirm.
cols = ['MDVP:Fo(Hz)', 'MDVP:Jitter(%)','MDVP:Shimmer', 'MDVP:Shimmer(dB)', 'HNR','RPDE', 'DFA',
'spread1', 'spread2', 'D2', 'PPE', 'status']
sns.pairplot(Data[cols])
sns.pairplot(Data)
# Separate the features (drop the target 'status' and the identifier 'name')
# from the target labels.
x = Data.drop(['status','name'],axis=1)
y = Data['status']
# Standardize every attribute to z-scores so distance-based models (KNN, SVM)
# are not dominated by large-magnitude features.
XScaled = x.apply(zscore)
XScaled.describe()
# 70/30 train-test split on the scaled features (fixed seed for reproducibility).
# BUGFIX: the original performed an earlier split on the unscaled x whose
# results were immediately overwritten by this one; the dead split is removed.
x_train, x_test, y_train, y_test = train_test_split(XScaled, y, test_size=0.3, random_state=42)
Data.shape,x_train.shape,x_test.shape
# --- Logistic Regression ---
# Fit on the scaled training split, then score both splits.
clf_lr = LogisticRegression(solver="liblinear")
clf_lr.fit(x_train, y_train)
y_predict = clf_lr.predict(x_test)
train_score_lr = clf_lr.score(x_train, y_train)
model_score = clf_lr.score(x_test, y_test)
# Inspect the learned coefficients together with the intercept.
coef_df = pd.DataFrame(clf_lr.coef_)
coef_df['intercept'] = clf_lr.intercept_
print(coef_df)
print("Train Accuracy with Logistic Regression = ", train_score_lr)
print("Test Accuracy with Logistic Regression = ", model_score)
from sklearn.neighbors import KNeighborsClassifier

# --- K-Nearest Neighbours (k = 7, distance-weighted votes) ---
clf_NNH = KNeighborsClassifier(n_neighbors=7, weights='distance')
clf_NNH.fit(x_train, y_train)
predicted_labels = clf_NNH.predict(x_test)
print("Train Accuracy with KNN = ", clf_NNH.score(x_train, y_train))
print("Test Accuracy with KNN = ", clf_NNH.score(x_test, y_test))
from sklearn.naive_bayes import GaussianNB

# --- Gaussian Naive Bayes ---
clf_NB = GaussianNB()
clf_NB.fit(x_train, y_train)
# Predictions and accuracy on the held-out test split.
y_predict_NB = clf_NB.predict(x_test)
model_score_NB = clf_NB.score(x_test, y_test)
print("Train Accuracy with Naive Bayes = ", clf_NB.score(x_train, y_train))
print("Test Accuracy with Naive Bayes = ", model_score_NB)
from sklearn import svm

# --- Support Vector Machine (C = 3, gamma = 0.025) ---
clf_svm = svm.SVC(C=3, gamma=0.025)
clf_svm.fit(x_train, y_train)
y_pred = clf_svm.predict(x_test)
print("Train Accuracy %0.2f " % clf_svm.score(x_train, y_train))
print("Accuracy %0.2f " % clf_svm.score(x_test, y_test))
# Side-by-side recap of train/test accuracy for all four base models
# fitted above (logistic regression, KNN, Naive Bayes, SVM).
print("Train Accuracy with Logistic Regression = ",clf_lr.score(x_train, y_train) )
print("Test Accuracy with Logistic Regression = ",model_score)
print("Train Accuracy with KNN = ",clf_NNH.score(x_train, y_train) )
print("Test Accuracy with KNN = ",clf_NNH.score(x_test, y_test))
print("Train Accuracy with Naive Bayes = ",clf_NB.score(x_train, y_train) )
print("Test Accuracy with Naive Bayes = ",model_score_NB)
print("Train Accuracy in SVM %0.2f " % (clf_svm.score(x_train, y_train)))
print("Test Accuracy in SVM %0.2f " % (clf_svm.score(x_test, y_test)))
from mlxtend.classifier import StackingClassifier
from mlxtend.classifier import StackingCVClassifier
from sklearn import model_selection

# --- Stacking ensemble: KNN, Naive Bayes and SVM as base learners with a
# logistic-regression meta-classifier (StackingCVClassifier uses out-of-fold
# base predictions to train the meta model). ---
base_learners = [clf_NNH, clf_NB, clf_svm]
sclf = StackingCVClassifier(classifiers=base_learners,
                            meta_classifier=clf_lr, random_state=42)
print('3-fold cross validation:\n')
# NOTE(review): cross-validation below runs on the *unscaled* features x,
# while the models above were fitted on z-scored data — confirm intentional.
model_labels = ['KNN', 'Naive Bayes', 'SVM', 'StackingClassifier']
for clf, label in zip(base_learners + [sclf], model_labels):
    scores = model_selection.cross_val_score(clf, x, y,
                                             cv=3, scoring='accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]"
          % (scores.mean(), scores.std(), label))
# Observed result: training accuracy = 0.90, test accuracy = 0.92.
# Fit the stacked ensemble on the scaled training split and report accuracy.
sclf.fit(x_train, y_train)
stack_train_acc = sclf.score(x_train, y_train)
stack_test_acc = sclf.score(x_test, y_test)
print("Train Accuracy %0.2f " % stack_train_acc)
print("Accuracy %0.2f " % stack_test_acc)
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier

# --- Random Forest (50 trees, up to 12 candidate features per split) ---
rfcl = RandomForestClassifier(n_estimators=50, random_state=1, max_features=12)
rfcl.fit(x_train, y_train)
y_predict = rfcl.predict(x_test)
print("Accuracy of RandomForest model = ", rfcl.score(x_test, y_test))
# Confusion matrix (rows = actual, columns = predicted) shown as a heat map.
cm = confusion_matrix(y_test, y_predict, labels=[0, 1])
df_cm = pd.DataFrame(cm, index=["0", "1"], columns=["0", "1"])
plt.figure(figsize=(7, 5))
sns.heatmap(df_cm, annot=True, fmt='g');
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import confusion_matrix

# --- Bagging: 50 bootstrap-trained decision trees ---
# BUGFIX: the original passed an undefined name `dTree` (NameError); define
# the decision-tree base estimator explicitly before handing it to the bagger.
dTree = DecisionTreeClassifier(random_state=1)
# NOTE(review): `base_estimator` was renamed `estimator` in scikit-learn 1.2
# and removed in 1.4; the original keyword is kept for compatibility with the
# environment this notebook ran in — update if the sklearn version is newer.
bgcl = BaggingClassifier(base_estimator=dTree, n_estimators=50, random_state=1)
bgcl = bgcl.fit(x_train, y_train)
y_predict = bgcl.predict(x_test)
print("Bagging classifier accuracy = ", bgcl.score(x_test, y_test))
# Confusion matrix (rows = actual, columns = predicted) shown as a heat map.
cm = confusion_matrix(y_test, y_predict, labels=[0, 1])
df_cm = pd.DataFrame(cm, index=["0", "1"], columns=["0", "1"])
plt.figure(figsize=(7, 5))
sns.heatmap(df_cm, annot=True, fmt='g');
from sklearn.ensemble import AdaBoostClassifier

# --- AdaBoost with 10 boosting rounds ---
abcl = AdaBoostClassifier(n_estimators=10, random_state=1)
abcl.fit(x_train, y_train)
y_predict = abcl.predict(x_test)
print("Accuracy of AdaBoost Classifier = ", abcl.score(x_test, y_test))
# Confusion matrix (rows = actual, columns = predicted) shown as a heat map.
cm = confusion_matrix(y_test, y_predict, labels=[0, 1])
df_cm = pd.DataFrame(cm, index=["0", "1"], columns=["0", "1"])
plt.figure(figsize=(7, 5))
sns.heatmap(df_cm, annot=True, fmt='g');
from sklearn.ensemble import GradientBoostingClassifier

# --- Gradient Boosting with 50 stages ---
gbcl = GradientBoostingClassifier(n_estimators=50, random_state=1)
gbcl.fit(x_train, y_train)
y_predict = gbcl.predict(x_test)
print("Accuracy of GradientBoost classifier = ", gbcl.score(x_test, y_test))
# Confusion matrix (rows = actual, columns = predicted) shown as a heat map.
cm = confusion_matrix(y_test, y_predict, labels=[0, 1])
df_cm = pd.DataFrame(cm, index=["0", "1"], columns=["0", "1"])
plt.figure(figsize=(7, 5))
sns.heatmap(df_cm, annot=True, fmt='g');
# Final recap: train/test accuracy of every ensemble model fitted above
# (stacking meta-classifier, random forest, bagging, AdaBoost, gradient boost).
print("Train Accuracy of meta-classifier %0.2f " % (sclf.score(x_train, y_train)))
print("Test Accuracy of meta-classifier %0.2f " % (sclf.score(x_test, y_test)))
print("Train Accuracy of RandomForest model = ",rfcl.score(x_train, y_train))
print("Test Accuracy of RandomForest model = ",rfcl.score(x_test, y_test))
print("Train Accuracy of Bagging classifier = ",bgcl.score(x_train, y_train))
print("Test accuracy of Bagging classifier = ",bgcl.score(x_test , y_test))
print("Train Accuracy of AdaBoost Classifier =",abcl.score(x_train , y_train))
print("Test Accuracy of AdaBoost Classifier =",abcl.score(x_test , y_test))
print("Train Accuracy of GradientBoost classifier =",gbcl.score(x_train, y_train))
print("Test Accuracy of GradientBoost classifier =",gbcl.score(x_test, y_test))